#!/usr/bin/python

import sys,os,re,optparse,shutil,glob
import matplotlib
if "DISPLAY" not in os.environ: matplotlib.use("AGG")
else: matplotlib.use("GTK")
import signal
signal.signal(signal.SIGINT,lambda *args:sys.exit(1))
from matplotlib import patches
from pylab import *
from scipy.stats.stats import trim1

from scipy.ndimage import measurements
from scipy.misc import imsave
from PIL import Image
import ocrolib
from ocrolib import hocr

# Command-line interface: positional arguments are page images; a single
# directory argument is expanded to the page images it contains (see below).
parser = optparse.OptionParser(usage="""
%prog [options] book/????.png

Puts together the result of OCR steps into an XHTML output file.
Uses...

book/0001.png                # page image
book/0001.bin.png            # binarized page image
book/0001.pseg.png           # page segmentation
book/0001/010001.txt         # recognizer output for lines
book/0001/010001.cseg.png    # character segmentation for lines

etc.
""")
# The flag *suppresses* the <br /> emitted after every line; the previous help
# text ("output line breaks") described the opposite of what the flag does.
parser.add_option("-b","--nobreaks",action="store_true", help="don't output line breaks (<br />) after each line")
options,args = parser.parse_args()
# Convenience: a lone directory argument stands for every four-character
# *.png file directly inside it, processed in sorted order.
if len(args)==1 and os.path.isdir(args[0]):
    args = sorted(glob.glob(args[0]+"/????.png"))

def E(*args):
    """Write a diagnostic line to stderr.

    The positional arguments (all strings) are joined with single spaces
    and followed by a newline.
    """
    err = sys.stderr
    message = " ".join(args)
    err.write(message)
    err.write("\n")
def P(*args):
    """Write an output line to stdout.

    The positional arguments (all strings) are concatenated with no
    separator and printed followed by a newline.
    """
    # The parenthesised single-argument form behaves the same as the Python 2
    # print statement and is also valid as the Python 3 print function.
    print("".join(args))

# Main driver: emit the document header, one <div class='ocr_page'> per page
# image given on the command line, then the document footer.
P(hocr.header())

for arg in args:
    # Strip all extensions so sibling files/directories can be derived from
    # the same base name.
    base,_ = ocrolib.allsplitext(arg)
    try:
        P("<div class='ocr_page' title='file %s'>"%arg)
        # Pages lacking a segmentation image or a line directory are reported
        # on stderr and skipped; the `finally` below still closes the <div>.
        if not os.path.exists(base+".pseg.png"):
            E("%s: no such file"%(base+".pseg.png",))
            continue
        if not os.path.isdir(base):
            E("%s: no such directory"%base)
            continue
        pseg = ocrolib.read_page_segmentation(base+".pseg.png")
        regions = ocrolib.RegionExtractor()
        regions.setPageLines(pseg)
        # NOTE(review): index 0 is skipped -- presumably the background
        # region; confirm against ocrolib.RegionExtractor.
        for i in range(1,regions.length()):
            line_id = regions.id(i)
            x0,y0,x1,y1 = regions.bboxMath(i)
            # Per-line files are named by the region id as six hex digits.
            lbase = "%s/%06x"%(base,line_id)
            if not os.path.exists(lbase+".txt"):
                E("%s: no such file"%(lbase+".txt"))
                continue
            with open(lbase+".txt") as stream:
                text = stream.read()
            # Escape markup-significant characters, '&' first so the entities
            # produced for '<' are not escaped a second time.  Plain
            # str.replace is used because re.sub keeps the unknown escape
            # "\&" in a replacement string verbatim, which put a stray
            # backslash in front of every entity.
            text = text.replace("&","&amp;")
            text = text.replace("<","&lt;")
            P("<span class='ocr_line' title='bbox %d %d %d %d'>"%(x0,y0,x1,y1),text,"</span>")
            if not options.nobreaks:
                P("<br />")
    finally:
        # Always close the page element, including on the early `continue`s.
        P("</div>")

P(hocr.footer())
